
import click
import numpy as np
import pandas as pd
from sklearn.preprocessing import scale
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score

from test_classifiers import get_feature_name, make_classifier_list

################################################################################
#
# MAIN
#
################################################################################

def _read_file_ids(path):
    """Return the integer ids listed one per line in the file at *path*.

    The file is closed as soon as it has been read, and blank lines are
    skipped so that a trailing empty line does not abort the run.
    """
    with open(path) as handle:
        return [int(line) for line in handle if line.strip()]


def _feature_matrix(frame, feature_names):
    """Return the *feature_names* columns of *frame* as a 2-D numpy array.

    Assumes *feature_names* is a sequence of column labels. Unlike the
    removed ``DataFrame.as_matrix(columns=...)``, selecting by a list raises
    ``KeyError`` for unknown column names.
    """
    # DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0;
    # column selection followed by .values works on old and new versions.
    return frame[list(feature_names)].values


def _escaped_labels(frame):
    """Return a 0/1 integer vector: 1 where the 'escaped' column equals "Y"."""
    return (frame["escaped"].values == "Y").astype(int)


def _standardize(X_train, X_test):
    """Standardize both matrices with statistics learned from X_train only.

    Scaling each matrix independently would put the train and test features
    on different scales and leak test-set statistics into the evaluation.
    """
    # Local import: keeps this block self-contained without touching the
    # file-level import section.
    from sklearn.preprocessing import StandardScaler

    scaler = StandardScaler().fit(X_train)
    return scaler.transform(X_train), scaler.transform(X_test)


def _as_yn(labels):
    """Map an iterable of 0/1 labels to a list of "N"/"Y" strings."""
    return ["Y" if label == 1 else "N" for label in labels]


@click.command()
@click.option('--root', default="../../data/")
@click.option('--feature_set_name', default="wide")
@click.option('--n_folds', default=10)
@click.option('--scaling_on/--scaling_off', default=False)
def main(root, feature_set_name, n_folds, scaling_on):
    """Fit every classifier on the training rows and score it on the test rows.

    Rows of <root>/features_labels_wide.csv whose file_id is listed in
    <root>/study_2/photo_ids.txt form the test set; all other rows form the
    training set. Each classifier returned by make_classifier_list() is
    fitted on the training set, and its AUC and accuracy on the test set are
    printed.

    Example:
    python make_test_set_preds.py --feature_set_name wide --scaling_off

    Parameters:
        root: data directory containing the input files.
        feature_set_name: name passed to get_feature_name() to select the
            feature columns (the CSV file name itself is fixed).
        n_folds: number of stratified folds handed to make_classifier_list().
        scaling_on: when true, standardize the features using the training
            set mean and standard deviation.
    """
    # Single-argument print(...) calls produce the same output on Python 2
    # and Python 3.
    print("--------------------------------------------------------------------")
    print("make_test_set_preds.py\n")
    print("root: %s" % (root,))
    print("feature_set_name: %s" % (feature_set_name,))
    print("scaling_on: %s" % (scaling_on,))
    print("n_folds: %s" % (n_folds,))
    print("--------------------------------------------------------------------")

    #
    # Read data
    #
    df = pd.read_csv("%s/features_labels_wide.csv" % root)

    # read test ids
    test_sample_fname = "%s/study_2/photo_ids.txt" % root
    test_file_ids = _read_file_ids(test_sample_fname)

    # split the full dataset into test rows (listed file_ids) and training
    # rows (everything else); the mask is computed once and reused
    is_test = df['file_id'].isin(test_file_ids)
    df_test = df[is_test]
    df_train = df[~is_test]
    assert df.shape[0] == (df_train.shape[0] + df_test.shape[0])

    # read feature names
    feature_names = get_feature_name(feature_set_name)

    # create training matrix
    X_train = _feature_matrix(df_train, feature_names)
    y_train = _escaped_labels(df_train)
    print("Training | X: %s y: %s" % (X_train.shape, y_train.shape))

    # create test matrix
    X_test = _feature_matrix(df_test, feature_names)
    y_test = _escaped_labels(df_test)
    print("Testing  | X: %s y: %s" % (X_test.shape, y_test.shape))

    # scale data (parameters are estimated on the training set only)
    if scaling_on:
        X_train, X_test = _standardize(X_train, X_test)

    #
    # Disabled sanity check: make sure that the performance of the classifier
    # matches your expectations (nested cross-validation on the training set).
    # Uncomment to run.
    #
    # inner_cv = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=0)
    # outer_cv = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=0)
    #
    # clfs, clf_names = make_classifier_list(inner_cv, verbose=False)
    #
    # for clf, clf_name in zip(clfs, clf_names):
    #     res = cross_validate(
    #         clf,
    #         X_train,
    #         y_train,
    #         cv=outer_cv,
    #         scoring=["roc_auc", "accuracy"],
    #         return_train_score=True
    #     )
    #
    #     print("--- %d-fold cross-validation" % n_folds)
    #     print("%-25s | AUC | train: %.3f (%.3f) | test: %.3f (%.3f)" % (
    #             clf_name,
    #             res["train_roc_auc"].mean(), res["train_roc_auc"].std(),
    #             res["test_roc_auc"].mean(), res["test_roc_auc"].std()))
    #
    #     print("%-25s | ACC | train: %.3f (%.3f) | test: %.3f (%.3f)" % (
    #             clf_name,
    #             res["train_accuracy"].mean(), res["train_accuracy"].std(),
    #             res["test_accuracy"].mean(), res["test_accuracy"].std()))

    #
    # Make predictions on the test set
    #
    inner_cv = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=0)
    clfs, clf_names = make_classifier_list(inner_cv, verbose=True)

    for clf, clf_name in zip(clfs, clf_names):
        print("--------------------------")
        print("Classifier: %s" % clf_name)

        model = clf.fit(X_train, y_train)

        # Grid-search wrappers expose the hyper-parameters they selected.
        if isinstance(model, GridSearchCV):
            print("params: %s" % (model.best_params_,))

        y_hat = model.predict(X_test)
        #y_hat_prob = model.predict_proba(X_test)

        # NOTE(review): the AUC below is computed from hard 0/1 predictions,
        # not from scores or probabilities, so it reflects a single operating
        # point of each classifier. Switching to predict_proba() would change
        # the reported numbers; confirm before doing so.
        print("--------------------------")
        #print("F1:       %.3f" % f1_score(y_test, y_hat))
        print("AUC:      %.3f" % roc_auc_score(y_test, y_hat))
        print("Accuracy: %.3f" % accuracy_score(y_test, y_hat))
        print("--------------------------\n")

        #
        # Output predictions to file
        #
        df_out = pd.DataFrame(
            data={
                "file_id": df_test["file_id"],
                #"prob_esc": y_hat_prob[:, 1],
                "pred_esc": _as_yn(y_hat),
                "true_esc": _as_yn(y_test)
            }
        )
        # Writing the predictions is currently disabled; uncomment to save
        # one CSV file per classifier.
        #
        # df_out.to_csv(
        #     "%s/sample_2k_nested/machine_predictions_[%s].csv" % (root, clf_name),
        #     #columns=["file_id", "prob_esc", "pred_esc", "true_esc"],
        #     columns=["file_id", "pred_esc", "true_esc"],
        #     sep=",",
        #     float_format="%.3f",
        #     index=False
        # )

    print("Done!")

################################################################################

# Invoke the click command only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

# END
